Introduction

In order to compare the ability of NRLB and DeepBind to explain HT-SELEX data, we need to bin observed probe counts using the scores \(S_i\) from either NRLB or DeepBind. These binned counts can be analyzed using the following methods:

Enrichment

We can compare the ratios of binned R0 and R1 reads in every score bin \(j\) by assuming \[ \frac{R1_j}{R0_j} = \exp^{S_i-c}\] where \(c\) is a scaling factor incorporating library size and other unaccounted effects. \(c\) is found by minimizing the RMSE.

Load information:

library(ggplot2)
library(scales)
library(gridExtra)
library(grid)
setwd("~/Desktop/dbsampling/")
#setwd("~/Documents/Research/SELEX/MultinomialPaper/ModelComparisons/dbsampling/")
info = read.table("dbIDs.tsv", header=TRUE, stringsAsFactors=FALSE)

Some helper functions below:

integrator = function(input) {
  sum = 0
  x = input[,1]
  y = input[,2]
  for (i in 2:length(y)) {
    sum = sum + (y[i]+y[i-1])/2*(x[i]-x[i-1])
  }
  return(sum)
}

bin.scores = function(counts, scores, breaks) {
  idx = counts!=0
  counts = counts[idx]
  scores = scores[idx]
  
  out.scores = scores
  for (i in 2:max(counts)) {
    for (j in 2:i) {
      out.scores = c(out.scores, scores[counts==i])
    }
  }
  hist.counts = hist(x = out.scores, breaks=breaks, plot=FALSE)$counts
  return(hist.counts)
}

compute.enrichment = function(R0, R1) {
  Enrichment = data.frame(Score=R1$Score,
                           Enrichment=log(R1$BinnedCounts/R0$BinnedCounts),
                           MinCount=do.call(pmin, as.data.frame(cbind(R1$BinnedCounts, R0$BinnedCounts))))
  Enrichment = Enrichment[complete.cases(Enrichment),]
  Enrichment = Enrichment[Enrichment$Enrichment!=Inf,]
  Enrichment = Enrichment[Enrichment$Enrichment!=-Inf,]
  #Adjust to minimize sum of squared errors
  offset = optimize(f = function(x) sum((Enrichment$Score-Enrichment$Enrichment+x)^2), 
                    interval = c(-40,40))
  Enrichment$Enrichment = Enrichment$Enrichment-offset$minimum
  RMSE = sqrt(offset$objective/nrow(Enrichment))
  temp = Enrichment[Enrichment$MinCount>50,]
  RMSE.50 = sqrt(sum((temp$Score-temp$Enrichment)^2)/nrow(temp))
  Range = range(Enrichment[,1:2])
  Range[1] = floor(Range[1])
  Range[2] = ceiling(Range[2])
  return(list(Enrichment=Enrichment, RMSE=RMSE, RMSE50=RMSE.50, Range=Range))
}

compute.means = function(counts, density) {
  #Compute Means
  Means = NULL
  for (i in 2:nrow(counts)) {
    Means = c(Means, counts$BinnedCounts[i]/
                ((density$ProbeCounts[i]+density$ProbeCounts[i-1])*(density$Score[i]-density$Score[i-1])/2))
  }
  Means = log10(Means/sum(counts$BinnedCounts))
  #Compute a Z
  Z = integrator(input = cbind(density$Score, exp(density$Score)*density$ProbeCounts))
  #Compute observation frequency
  Frequency = NULL
  for (i in 2:nrow(counts)) {
    Frequency = c(Frequency, (exp(density$Score[i])+exp(density$Score[i-1]))/(2*Z))
  }
  Frequency = log10(Frequency)
  Means = data.frame(Frequency, Means, MinCount=counts$BinnedCounts[2:nrow(counts)])
  Means = Means[complete.cases(Means),]
  Means = Means[Means$Means!=Inf,]
  Means = Means[Means$Means!=-Inf,]
  RMSE = sqrt(sum((Means$Frequency-Means$Means)^2)/nrow(Means))
  temp = Means[Means$MinCount>50,]
  RMSE.50 = sqrt(sum((temp$Frequency-temp$Means)^2)/nrow(temp))
  Range = range(Means[,1:2])
  Range[1] = floor(Range[1])
  Range[2] = ceiling(Range[2])
  return(list(Means=Means, RMSE=RMSE, RMSE50 = RMSE.50, Range=Range))
}

roc = function(scores, labels) {
  breaks = scores[order(scores)]
  P = sum(labels>0)
  N = sum(labels==0)
  
  FP.subset = scores[labels==0]
  TP.subset = scores[labels>0]
  
  FP = sapply(breaks, function(x) sum(FP.subset>=x))
  TP = sapply(breaks, function(x) sum(TP.subset>=x))
  
  return(data.frame(FalsePositive=FP/N, TruePositive=TP/P, Threshold=breaks))
}

means.plot = function(df) {
  r = df$Range
  out = ggplot(df$Means, aes(x=Frequency, y=Means, color=MinCount)) +
    coord_fixed(xlim = r, ylim = r, ratio=1) +
    theme_bw() +
    geom_abline(size=1, lty=2, col="grey") + 
    geom_point(alpha=.65) +
    scale_color_gradient("Counts", low="red", high="blue", limits=c(0,100), oob=squish) +
    xlab(expression(paste(log[10]," Predicted Sequencing Rate (per probe)"))) +
    ylab(expression(paste(log[10]," Observed Sequencing Rate (per probe)"))) + 
    annotate("text", x=r[1], y=r[2], 
             label=paste0("RMSE: ", round(df$RMSE, digits=3)), hjust=0, vjust=1) + 
    annotate("text", x=r[1], y=.95*r[2]+.05*r[1], 
             label=paste0("RMSE[50]: ", round(df$RMSE50, digits=3)), parse=TRUE, hjust=0, vjust=1) +  
  theme(legend.position=c(.9,.2), legend.title.align=.5)
  return(out)
}

enrichment.plot = function(df) {
  r = df$Range
  out = ggplot(df$Enrichment, aes(x=Score, y=Enrichment, color=MinCount)) +
    coord_fixed(xlim = r, ylim = r, ratio=1) +
    theme_bw() +
    geom_abline(size=1, lty=2, col="grey") + 
    geom_point(alpha=.65) +
    scale_color_gradient("Counts", low="red", high="blue", limits=c(0,100), oob=squish) +
    ylab("Log Enrichment") +
    xlab("Score") +   
    annotate("text", x=r[1], y=r[2], 
             label=paste0("RMSE: ", round(df$RMSE, digits=3)), hjust=0, vjust=1) + 
    annotate("text", x=r[1], y=.95*r[2]+.05*r[1], 
             label=paste0("RMSE[50]: ", round(df$RMSE50, digits=3)), parse=TRUE, hjust=0, vjust=1) +
  theme(legend.position=c(.9,.2), legend.title.align=.5)
  return(out)
}

density.plots = function(DOS, R0, R1) {
  divSize = DOS$Score[2]-DOS$Score[1]
  r = range(c(range(DOS$Score[DOS$ProbeCounts!=0]), range(DOS$Score[DOS$ProbeCounts!=0])))
  df = rbind(data.frame(Score = DOS$Score, Density = DOS$ProbeCounts/integrator(DOS), Type="DOS"),
             data.frame(Score = DOS$Score, Density = R0$BinnedCounts/sum(R0$BinnedCounts)/divSize, Type="R0 Counts"),
             data.frame(Score = DOS$Score, Density = R1$BinnedCounts/sum(R1$BinnedCounts)/divSize, Type="R1 Counts"))
  out = ggplot(df, aes(x = Score, y = Density, color = Type)) +
    coord_fixed(xlim = r) + 
    theme_bw() +
    geom_path(aes(linetype = Type)) + 
    scale_linetype_manual(values=c(1, 2, 2)) + 
    scale_color_manual(values=c("darkgrey", "red", "blue")) + 
    theme(legend.position=c(.85,.7), legend.title.align=.5, aspect.ratio = 1)
  return(out)
}

rescaled.dos = function(dos) {
  sorted.vals = unique(sort(dos)) 
  for (currCutoff in sorted.vals) {
    cloned.dos = dos
    cloned.dos[cloned.dos<currCutoff] = 0
    cloned.dos[cloned.dos>=currCutoff] = exp(cloned.dos[cloned.dos>=currCutoff] - currCutoff + .01)
    if (sum(range(cloned.dos) %in% Inf) == 0) {
      break
    }
  }
  return(cloned.dos)
}

multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  library(grid)
  
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  
  numPlots = length(plots)
  
  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }
  
  if (numPlots==1) {
    print(plots[[1]])
    
  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    
    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}

Analysis over all proteins

AllEnrichments = NULL
AllMeans = NULL
AllROCs = NULL
Names = NULL
for (i in 1:30) { #nrow(info)) {
  protName = info$Name[i]
  count.file = paste0("CountScores/", protName, "_R0R1.tsv")
  score.file = paste0("CountScores/", protName, "_scores.tsv")
  dbdensity.file = paste0("DBDensities/",protName,".tsv")
  nrlbdensity.file = paste0("NRLBDensities/",protName,".tsv")
  #Check to see if count/score files exist
  if(!(file.exists(count.file) && file.exists(score.file) && file.exists(dbdensity.file))) {
    next
  }
  if (file.info(dbdensity.file)$size==0) {
    next
  }
  cat(paste0("  \n####",protName," \n"))
  Names = c(Names, protName)
  #Load all files
  counts = read.table(count.file, header = FALSE, stringsAsFactors = FALSE)
  names(counts) = c("Sequence", "R0Count", "R1Count")
  scores = read.table(score.file, header = TRUE, stringsAsFactors = FALSE)
  DOS.DB  = read.table(dbdensity.file, header = FALSE, stringsAsFactors = FALSE)
  DOS.NRLB= read.table(nrlbdensity.file, header = FALSE, stringsAsFactors = FALSE)
  
  #Compute DB/NRLB distribution and reset energies
  dbMax = max(DOS.DB$V1[DOS.DB$V2!=0])
  nrlbMax = max(DOS.NRLB$V1[DOS.NRLB$V2!=0])
  DOS.DB$V2 = rescaled.dos(DOS.DB$V2)
  DOS.NRLB$V2 = rescaled.dos(DOS.NRLB$V2)
  Density.DB = data.frame(Score=DOS.DB$V1-dbMax, ProbeCounts=(DOS.DB$V2*4^info$VarLen[i])/integrator(DOS.DB))
  Density.NRLB=data.frame(Score=DOS.NRLB$V1-nrlbMax, ProbeCounts=(DOS.NRLB$V2*4^info$VarLen[i])/integrator(DOS.NRLB))
  scores$DeepBind = scores$DeepBind - dbMax
  scores$NRLB = scores$NRLB - nrlbMax
  #TODO: Optimal histogram spacing?
  
  #Bin counts
  R0.DB = data.frame(Score=Density.DB$Score, 
                    BinnedCounts=c(0, bin.scores(counts = counts$R0Count, 
                                                 scores = scores$DeepBind, breaks = Density.DB$Score)))
  R1.DB = data.frame(Score=Density.DB$Score, 
                    BinnedCounts=c(0, bin.scores(counts = counts$R1Count, 
                                                 scores = scores$DeepBind, breaks = Density.DB$Score)))
  R0.NRLB = data.frame(Score=Density.NRLB$Score, 
                    BinnedCounts=c(0, bin.scores(counts = counts$R0Count, 
                                                 scores = scores$NRLB, breaks = Density.NRLB$Score)))
  R1.NRLB = data.frame(Score=Density.NRLB$Score, 
                    BinnedCounts=c(0, bin.scores(counts = counts$R1Count, 
                                                 scores = scores$NRLB, breaks = Density.NRLB$Score)))
  
  #Plot densities
  p1 = density.plots(Density.NRLB, R0.NRLB, R1.NRLB) + ggtitle("NRLB Densities")
  p2 = density.plots(Density.DB, R0.DB, R1.DB) + ggtitle("DeepBind Densities")
  multiplot(p1, p2, cols=2)
  #grid.arrange(p1, p2, ncol = 2)
  
  #Compute log enrichment for NRLB and DB and plot them
  NRLB.enrichment = compute.enrichment(R0.NRLB, R1.NRLB)
  DB.enrichment = compute.enrichment(R0.DB, R1.DB)
  p.nrlb = enrichment.plot(NRLB.enrichment) + ggtitle("NRLB Enrichment")
  p.db = enrichment.plot(DB.enrichment) + ggtitle("DeepBind Enrichment")
  multiplot(p.nrlb, p.db, cols=2)
  #grid.arrange(p.nrlb, p.db, ncol = 2)
  AllEnrichments = rbind(AllEnrichments, c(NRLB.enrichment$RMSE50, DB.enrichment$RMSE50))
  
  #Compute means/poisson rate
  NRLB.means = compute.means(R1.NRLB, Density.NRLB)
  DB.means = compute.means(R1.DB, Density.DB)
  p.nrlb = means.plot(NRLB.means) + ggtitle("NRLB Poisson Means")
  p.db = means.plot(DB.means) + ggtitle("DeepBind Poisson Means")
  multiplot(p.nrlb, p.db, cols=2)
  #grid.arrange(p.nrlb, p.db, ncol = 2)
  
  #Compute optimal means
  NRLB.opt = read.table(paste0("NRLBPoissonDensities/", protName, ".tsv"), header=FALSE, stringsAsFactors = FALSE)
  Means = data.frame(Frequency=NRLB.opt$V1, Means = log10((NRLB.opt$V3/NRLB.opt$V2)/sum(NRLB.opt$V3)), MinCount=NRLB.opt$V3)
  Means = Means[complete.cases(Means),]
  Means = Means[Means$Means!=Inf,]
  Means = Means[Means$Means!=-Inf,]
  RMSE = sqrt(sum((Means$Frequency-Means$Means)^2)/nrow(Means))
  temp = Means[Means$MinCount>50,]
  RMSE.50 = sqrt(sum((temp$Frequency-temp$Means)^2)/nrow(temp))
  if (is.nan(RMSE.50)) {
    RMSE.50 = RMSE
  }
  Range = range(Means[,1:2])
  Range[1] = floor(Range[1])
  Range[2] = ceiling(Range[2])
  NRLB.opt = list(Means=Means, RMSE=RMSE, RMSE50 = RMSE.50, Range=Range)
  AllMeans = rbind(AllMeans, c(NRLB.opt$RMSE50, NRLB.means$RMSE50, DB.means$RMSE50))
  p1 = means.plot(NRLB.opt) + ggtitle("NRLB Optimal Poisson Means")
  
  #Compute ROC Curves
  categories = read.table(paste0("CountScores/",protName,"_ROC.tsv"), header = FALSE, stringsAsFactors = FALSE)
  scores = read.table(paste0("CountScores/",protName,"_ROC_scores.tsv"), header=TRUE, stringsAsFactors = FALSE)
  ROC.DB = roc(scores = scores$DeepBind, labels = categories$V2)
  ROC.NRLB = roc(scores = scores$NRLB, labels = categories$V2)
  AUROC.DB = -integrator(ROC.DB)
  AUROC.NRLB = -integrator(ROC.NRLB)
  AllROCs = rbind(AllROCs, c(AUROC.NRLB, AUROC.DB))
  
  #Plot
  df = rbind(cbind(ROC.DB, Method="DeepBind"), cbind(ROC.NRLB, Method="NRLB"))
  p2 = ggplot(df, aes(x=FalsePositive, y=TruePositive, color=Method)) +
   coord_fixed(ratio = 1, xlim=c(0,1), ylim=c(0,1)) +
   geom_path(size=.5, show.legend = TRUE) +
   theme_bw() +
   expand_limits(y=0) +
   scale_colour_manual(values=c("black", "red"), labels=c(paste0("DeepBind: ", round(AUROC.DB, 3)),
                                                          paste0("NRLB: ", round(AUROC.NRLB, 3)))) +
   geom_abline(slope=1, intercept = 0, linetype=2, alpha=.5, size=1) +
   xlab("False Positive Rate") +
   ylab("True Positive Rate") +
   ggtitle("ROC Curve") +
   theme(legend.position=c(.85,.2), legend.title.align=.5)
  multiplot(p1, p2, cols=2)
  #grid.arrange(p1, p2, ncol = 2)
  cat("  \n")
}

CEBPB

CEBPD

EBF1

EGR1

ELK1

ETS1

GABPA

GATA3

IRF4

MAFF

MAFK

MAX

MEF2A

NFATC1

NFE2

NR2C2

NRF1

PAX5

POU2F2

POU5F1P1

RFX5

RUNX3

TFAP2A

TFAP2C

USF1

YY1

ZNF143

Compiled Results

AllROCs = data.frame(AllROCs)
names(AllROCs) = c("NRLB", "DeepBind")
AllROCs = cbind(AllROCs, Names)
AllMeans = data.frame(AllMeans)
names(AllMeans) = c("NRLBOpt", "NRLB", "DeepBind")
AllMeans = cbind(AllMeans, Names)
AllEnrichments = data.frame(AllEnrichments)
names(AllEnrichments) = c("NRLB", "DeepBind")
AllEnrichments = cbind(AllEnrichments, Names)

r = range(AllEnrichments[,1:2])
ggplot(AllEnrichments, aes(x=NRLB, y=DeepBind, label=Names)) + 
  coord_fixed(ratio = 1, xlim=r, ylim=r) + 
  geom_abline(slope=1, intercept = 0, linetype=2, alpha=.5, size=.5) + 
  ggtitle("All Enrichments") +
  theme_bw() + 
  geom_point() + 
  geom_text(check_overlap = TRUE, vjust=1, hjust=0, nudge_x = 0.025*(r[2]-r[1])) +
  xlab("NRLB RMSE") +
  ylab("DeepBind RMSE")

r = range(AllROCs[,1:2])
ggplot(AllROCs, aes(y=NRLB, x=DeepBind, label=Names)) + 
  coord_fixed(ratio = 1, xlim=r, ylim=r) + 
  geom_abline(slope=1, intercept = 0, linetype=2, alpha=.5, size=.5) + 
  ggtitle("All ROCs") +
  theme_bw() + 
  geom_point() + 
  geom_text(check_overlap = TRUE, vjust=1, hjust=0, nudge_x = 0.025*(r[2]-r[1])) +
  ylab("NRLB ROC") +
  xlab("DeepBind ROC")

r = range(AllMeans[,1:3])
ggplot(AllMeans, aes(x=NRLB, y=DeepBind, color=NRLBOpt, label=Names)) + 
  coord_fixed(ratio = 1, xlim=r, ylim=r) + 
  geom_abline(slope=1, intercept = 0, linetype=2, alpha=.5, size=.5) + 
  ggtitle("All Poisson Means") +
  theme_bw() + 
  geom_point() + 
  scale_color_gradient("Optimal\nRMSE", low="blue", high="red", limits=r, oob=squish) + 
  geom_text(check_overlap = TRUE, vjust=1, hjust=0, nudge_x = 0.025*(r[2]-r[1]), color="black") +
  xlab("NRLB RMSE") +
  ylab("DeepBind RMSE")